% Load the vocabulary and the per-document term-index files, and build
% the term-by-document count matrix `documents` (one row per vocabulary
% entry, one column per index file).
%
% Workspace variables produced:
%   vocab       - N-by-1 cell array of vocabulary words
%   docFilename - 8-by-1 cell array with the index-file paths
%   documents   - N-by-8 matrix; documents(t,d) is the number of lines
%                 in index file d whose first '&'-separated field is t
clear

% fullfile inserts the platform's own separator, so the same relative
% paths resolve on Windows as well as on Unix-like systems.
dataDir = fullfile('..', 'data');
vocabFilename = fullfile(dataDir, 'vocab.txt');
csFilename = fullfile(dataDir, 'cs.index.txt');
databaseFilename = fullfile(dataDir, 'database.index.txt');
infotheoryFilename = fullfile(dataDir, 'infotheory.index.txt');
infoRetFilename = fullfile(dataDir, 'InfoRet.index.txt');
dataminingFilename = fullfile(dataDir, 'datamining.index.txt');
cryptographyFilename = fullfile(dataDir, 'cryptography.index.txt');
cryptanalysisFilename = fullfile(dataDir, 'cryptanalysis.index.txt');
historycryptoFilename = fullfile(dataDir, 'historycrypto.index.txt');

docFilename = {
    csFilename;
    databaseFilename;
    infotheoryFilename;
    infoRetFilename;
    dataminingFilename;
    cryptographyFilename;
    cryptanalysisFilename;
    historycryptoFilename
};

% --- Vocabulary: the word is the second '&'-separated field of each line.
fid = fopen(vocabFilename, 'r');
if fid < 0
    error('plsa:fileOpen', 'Could not open vocabulary file: %s', vocabFilename);
end

vocab = {};
while true
    tline = fgetl(fid);
    if ~ischar(tline), break, end          % fgetl returns -1 at end of file
    if isempty(tline), continue, end       % skip blank lines
    tokens = regexp(tline, '&', 'split');
    if numel(tokens) < 2
        fclose(fid);
        error('plsa:vocabFormat', ...
              'Vocabulary line has no ''&'' separator: %s', tline);
    end
    vocab{end+1,1} = strtrim(tokens{2}); %#ok<SAGROW>
end
fclose(fid);

% --- Term counts: the first '&'-separated field of each index-file line
% is a 1-based row number into `documents`.
vocabSize = size(vocab,1);
documents = zeros(vocabSize, size(docFilename,1));
for i = 1:size(docFilename,1)
    fid = fopen(docFilename{i}, 'r');
    if fid < 0
        error('plsa:fileOpen', 'Could not open index file: %s', docFilename{i});
    end
    while true
        tline = fgetl(fid);
        if ~ischar(tline), break, end
        tokens = regexp(tline, '&', 'split');
        % str2double only parses a number; unlike str2num it never
        % evaluates the text as MATLAB code.
        term = str2double(tokens{1});
        if isnan(term), continue, end      % blank or non-numeric line
        if term < 1 || term > vocabSize || term ~= fix(term)
            % Fail here rather than letting the matrix silently grow past
            % the vocabulary (which breaks the later vocab lookup).
            fclose(fid);
            error('plsa:termRange', ...
                  'Term id %g in %s is outside 1..%d.', ...
                  term, docFilename{i}, vocabSize);
        end
        documents(term, i) = documents(term, i) + 1;
    end
    fclose(fid);
end

% Problem dimensions: rows of the count matrix are vocabulary terms,
% columns are documents.
numWords = size(documents,1);
numDocs = size(documents,2);
numTopics = 2;

% P(w_j|z_k): numWords-by-numTopics, random start, each column scaled
% so that it sums to 1.
% (uniform alternative: w_z = ones(numWords, numTopics);)
w_z = rand(numWords, numTopics);
for topic = 1:numTopics
    topicColumn = w_z(:,topic);
    w_z(:,topic) = topicColumn / sum(topicColumn);
end

% P(z_k|d_i): numTopics-by-numDocs, random start, each column scaled
% so that it sums to 1.
% (uniform alternative: z_d = ones(numTopics, numDocs);)
z_d = rand(numTopics, numDocs);
for doc = 1:numDocs
    docColumn = z_d(:,doc);
    z_d(:,doc) = docColumn / sum(docColumn);
end

% P(z_k) -- set to the identity matrix; not referenced by the visible
% remainder of the script.
z = eye(numTopics);

% P(z_k|d_i,w_j): storage for the per-(topic, document, word) values
% filled in by the iteration that follows.
z_dw = zeros(numTopics, numDocs, numWords);

% EM iteration for the topic model.
%
% Reads:   documents (numWords-by-numDocs counts), numWords, numDocs,
%          numTopics, and the initial w_z, z_d, z_dw.
% Updates: z_dw(k,i,j) = P(z_k|d_i,w_j)   (E-step)
%          w_z(j,k)    = P(w_j|z_k)       (M-step)
%          z_d(k,i)    = P(z_k|d_i)       (M-step)
% Stops when the summed absolute change in w_z between two consecutive
% iterations is at most 0.05.

% Total term count of each document; constant across iterations.
docLength = sum(documents, 1);

convergence = 1e6;
while (convergence > 0.05)
    % ---- E-step: posterior over topics for every (document, word) pair.
    for i = 1:numDocs
        for j = 1:numWords
            joint = w_z(j,:).' .* z_d(:,i);    % numTopics-by-1
            evidence = sum(joint);             % normalise over ALL topics
            if evidence > 0
                z_dw(:,i,j) = joint / evidence;
            else
                % No topic assigns mass to this pair (e.g. a vocabulary
                % word that never occurs). Use a uniform posterior rather
                % than 0/0 = NaN, which would contaminate every sum below.
                z_dw(:,i,j) = 1 / numTopics;
            end
        end
    end

    % ---- Expected counts n(d_i,w_j) * P(z_k|d_i,w_j), summed over
    % documents (for w_z) and over words (for z_d).
    wordMass = zeros(numWords, numTopics);
    docMass = zeros(numTopics, numDocs);
    for k = 1:numTopics
        % z_dw(k,:,:) is 1-by-numDocs-by-numWords; bring it into the
        % numWords-by-numDocs layout of `documents`.
        posterior = reshape(z_dw(k,:,:), numDocs, numWords).';
        weighted = documents .* posterior;
        wordMass(:,k) = sum(weighted, 2);
        docMass(k,:) = sum(weighted, 1);
    end

    % ---- M-step for P(w|z). The per-topic normaliser is the same for
    % every word, so it is computed once per topic.
    w_z_old = w_z;
    topicMass = sum(wordMass, 1);
    for k = 1:numTopics
        if topicMass(k) > 0
            w_z(:,k) = wordMass(:,k) / topicMass(k);
        end                                    % else keep previous column
    end

    % Left without a semicolon so the per-iteration change is echoed to
    % the console.
    convergence = sum(sum(abs(w_z - w_z_old)))

    % ---- M-step for P(z|d). Documents with no terms keep their previous
    % distribution instead of becoming 0/0.
    for i = 1:numDocs
        if docLength(i) > 0
            z_d(:,i) = docMass(:,i) / docLength(i);
        end
    end
end

% Rank the vocabulary within each topic by descending P(w|z) and show
% the top-ranked words, one column per topic.
%
% The sort dimension is given explicitly so that a one-word vocabulary
% (w_z is then a row vector) is still sorted within each topic column.
[sortedProb, index] = sort(w_z, 1, 'descend'); %#ok<ASGLU>

topicWords = cell(size(index));
for k = 1:numTopics
    topicWords(:,k) = vocab( index(:,k) );
end

% Show at most 20 words per topic; smaller vocabularies show all of them.
numShown = min(20, size(topicWords,1));
topicWords(1:numShown,:)